if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("curatedTCGAData");
Update all/some/none? [a/s/n]:
n
BiocManager::install("TCGAutils");
Update all/some/none? [a/s/n]:
n
BiocManager::install("TCGAbiolinks");
Update all/some/none? [a/s/n]:
n
install.packages("SNFtool");
Error in install.packages : Updating loaded packages
install.packages("NetPreProc");
Error in install.packages : Updating loaded packages
library("curatedTCGAData");
library("TCGAbiolinks");
library("TCGAutils");
library("SNFtool")
library("NetPreProc");
library("cluster");
Download Prostate
adenocarcinoma dataset
assays <- c("miRNASeqGene", "RNASeq2Gene", "RPPAArray");
mo <- curatedTCGAData(diseaseCode = "PRAD",
assays = assays,
version = "2.1.1", dry.run=FALSE);
snapshotDate(): 2023-10-24
Working on: PRAD_RNASeq2Gene-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_RPPAArray-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_miRNASeqGene-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_colData-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_sampleMap-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
Working on: PRAD_metadata-20160128
see ?curatedTCGAData and browseVignettes('curatedTCGAData') for documentation
loading from cache
harmonizing input:
removing 5189 sampleMap rows not in names(experiments)
mo
A MultiAssayExperiment object of 3 listed
experiments with user-defined names and respective classes.
Containing an ExperimentList class object of length 3:
[1] PRAD_RNASeq2Gene-20160128: SummarizedExperiment with 20501 rows and 550 columns
[2] PRAD_RPPAArray-20160128: SummarizedExperiment with 195 rows and 352 columns
[3] PRAD_miRNASeqGene-20160128: SummarizedExperiment with 1046 rows and 547 columns
Functionality:
experiments() - obtain the ExperimentList instance
colData() - the primary/phenotype DataFrame
sampleMap() - the sample coordination DataFrame
`$`, `[`, `[[` - extract colData columns, subset, or experiment
*Format() - convert into a long or wide DataFrame
assays() - convert ExperimentList to a SimpleList of matrices
exportClass() - save data to flat files
Data
pre-processing
Consider only primary solid tumors Analyte information missing from
PRAD_RPPAArray (20th position) that causes inconsistent barcode lengths
in MultiArrayExperiment object mo (but doesn’t cause
problems for the code below).
primary <- TCGAutils::TCGAsampleSelect(colnames(mo), c("01"));
mo <- mo[,primary,]
Check for replicates for same patient (first 12 characters) No
replicates present in mo.
check_rep <- anyReplicated(mo);
print(check_rep)
PRAD_RNASeq2Gene-20160128 PRAD_RPPAArray-20160128 PRAD_miRNASeqGene-20160128
FALSE FALSE FALSE
Remove FFPE samples (frozen tissue is better preseved)
no_ffpe <- which(as.data.frame(colData(mo))$patient.samples.sample.is_ffpe == "no");
as.data.frame(colData(mo))
mo <- mo[, no_ffpe, ];
Consider samples having all considered omics data
complete <- intersectColumns(mo);
ExpressionList → simple list of matrices
complete <- assays(complete)
print(dim(complete[[1]]))
[1] 20501 348
Transpose all three matrices, obtaining samples \(\times\) features
complete <- lapply(complete,FUN=t)
Remove features having missing values (no missing values in the
current dataset)
for (i in 1:length(complete)){
#print(dim(complete[[i]]))
complete[[i]] <- complete[[i]][,colSums(is.na(complete[[i]]))==0]
#print(dim(complete[[i]]))
}
Select features having more variance across samples because they
bring more information.
nf <- 100;
for(i in 1:length(complete)){
idx <- caret::nearZeroVar(complete[[i]])
message(paste("Removed ", length(idx), "features from", names(complete)[i]));
if(length(idx) != 0){
complete[[i]] <- complete[[i]][, -idx];
}
if(ncol(complete[[i]]) <= nf) next
vars <- apply(complete[[i]], 2, var);
idx <- sort(vars, index.return=TRUE, decreasing = TRUE)$ix;
complete[[i]] <- complete[[i]][, idx[1:nf]];
}
Removed 1334 features from PRAD_RNASeq2Gene-20160128
Removed 0 features from PRAD_RPPAArray-20160128
Removed 471 features from PRAD_miRNASeqGene-20160128
Standardize features with z-score
zscore <- function(data){
zscore_vec <- function(x) { return ((x - mean(x)) / sd(x))}
data <- apply(data, 2, zscore_vec)
return(data)
}
complete <- lapply(complete, zscore);
Clean barcodes retaining only “Project-TSS-Participant” (first 12
characters)
for(v in 1:length(complete)){
rownames(complete[[v]]) <- substr(rownames(complete[[v]]), 1, 12);
}
Download disease
subtypes
subtypes <- as.data.frame(TCGAbiolinks::PanCancerAtlas_subtypes())
subtypes <- subtypes[subtypes$cancer.type == "PRAD", ];
# Retain only primary solid tumors and select samples in common with omics data
# (in the same order):
subtypes <- subtypes[TCGAutils::TCGAsampleSelect(subtypes$pan.samplesID, "01"), ];
sub_select <- substr(subtypes$pan.samplesID,1,12) %in% rownames(complete[[1]]);
subtypes <- subtypes[sub_select, ];
We consider only samples in the omics data in complete
for which also subtype data is present in subtypes
rownames(subtypes) <- substr(subtypes$pan.samplesID, 1, 12);
for (i in 1:length(complete))
complete[[i]] <- complete[[i]][rownames(subtypes),]
# Print number of samples for each subtype:
table(subtypes$Subtype_Integrative);
1 2 3
60 83 105
Check subtype &
omics data order
count <- 0
for(i in 1:length(rownames(subtypes))){
if(rownames(subtypes)[i] != rownames(complete[[1]])[i])
count <- count + 1
}
count
[1] 0
All patients/samples are in the same order in both the datasets. #
Multi-omics data integration Similarity matrices for each data source
using exponential euclidian distance (affinity matrix).
Integration through
SNF
Integration of the matrices using Similarity Network Fusion
t number of iterations _K_ number of neighbours to
consider to compute local similarity matrix
Integration through
average
Integration through average of matrices
for (i in 1:length(W_list))
plot(W_list[[i]],xlim=c(0,0.01),ylim=c(0,0.01))



Disease subtype
discovery with PAM
Number of clusters.
Each data source
Convert similarity matrix into a distance matrix. The similarities
are normalized in the range &$ using min-max normalization before
conversion into distances.
# Convert similarity matrix into a distance matrix. The similarities
# are normalized in the range [0, 1] using min-max normalization before
# conversion into distances.
dist <- list()
D <- list()
for (i in 1:length((W_list))){
dist[[i]] <- 1 - NetPreProc::Max.Min.norm(W_list[[i]])
D[[i]] <- as.dist(dist[[i]])
}
for (i in 1:length((W_list)))
pam.res <- pam(D[[i]], k=k)
Integrated data
through mean
dist_mean <- 1 - NetPreProc::Max.Min.norm(W_int_mean);
D_mean <- as.dist(dist_mean);
pam.res <- pam(D_mean, k=k);
Integrated data
through SNF
dist_SNF <- 1 - NetPreProc::Max.Min.norm(W_int_SNF);
D_SNF <- as.dist(dist_SNF);
# Apply clustering algorithms on integrated matrix:
pam.res <- pam(D_SNF, k=k);
#Disease subtype discovery with spectral clustering (Consider
distance matrix calculated above) ## Integrated data through SNF
k <- length(unique(subtypes$Subtype_Integrative));
sc.res <- SNFtool::spectralClustering(W_int_SNF, K=k)
#to have uniform type
pam.sc.res <- pam(sc.res,k=k)
sessionInfo()
---
title: "Bioinformatics project year 2023/2024"
author: "Lucia Anna Mellini"
date:
#bibliography: references.bib
output: 
    html_notebook:
        toc: true
        number_sections: true
        toc_float: true
        theme: cerulean
        fig_caption: true
---


```{r message=FALSE, warning=FALSE}
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("curatedTCGAData");
BiocManager::install("TCGAutils");
BiocManager::install("TCGAbiolinks");

install.packages("SNFtool");
install.packages("NetPreProc");
```

```{r message=FALSE, warning=FALSE}
library("curatedTCGAData");
library("TCGAbiolinks");
library("TCGAutils");

library("SNFtool")
library("NetPreProc");
library("cluster");
```
# Download Prostate adenocarcinoma dataset#
<!-- miRNASeqGene              Gene-level log2 RPM miRNA expression values
<>RNASeq2Gene               RSEM TPM gene expression values
<>RPPAArray                 Reverse Phase Protein Array normalized protein expression values -->
```{r}

assays <- c("miRNASeqGene", "RNASeq2Gene", "RPPAArray");
mo <- curatedTCGAData(diseaseCode = "PRAD", 
                        assays = assays,
                        version = "2.1.1", dry.run=FALSE);
```

```{r message=TRUE}
mo
```
# Data pre-processing
Consider only primary solid tumors
Analyte information missing from PRAD_RPPAArray (20th position) that causes inconsistent barcode lengths in MultiArrayExperiment object  ```mo``` (but doesn't cause problems for the code below).
```{r message=FALSE, warning=FALSE}
primary <- TCGAutils::TCGAsampleSelect(colnames(mo), c("01"));
mo <- mo[,primary,]
```
Check for replicates for same patient (first 12 characters)
No replicates present in ```mo```.
```{r}
check_rep <- anyReplicated(mo);
print(check_rep)
```
Remove FFPE samples (frozen tissue is better preseved)
```{r}
no_ffpe <- which(as.data.frame(colData(mo))$patient.samples.sample.is_ffpe == "no");
as.data.frame(colData(mo))
mo <- mo[, no_ffpe, ];
```
Consider samples having all considered omics data
```{r}
complete <- intersectColumns(mo);
```
ExpressionList → simple list of matrices
```{r}
complete <- assays(complete)
print(dim(complete[[1]]))
```
Transpose all three matrices, obtaining samples $\times$ features
```{r}
complete <- lapply(complete,FUN=t)
```
Remove features having missing values (no missing values in the current dataset)
```{r}
for (i in 1:length(complete)){
  #print(dim(complete[[i]]))
  complete[[i]] <- complete[[i]][,colSums(is.na(complete[[i]]))==0]
  #print(dim(complete[[i]]))
}
```
Select features having more variance across samples because they bring more information.
```{r}
nf <- 100;
for(i in 1:length(complete)){
    
    idx <- caret::nearZeroVar(complete[[i]])
    message(paste("Removed ", length(idx), "features from", names(complete)[i]));
    if(length(idx) != 0){
        complete[[i]] <- complete[[i]][, -idx];
    }

    if(ncol(complete[[i]]) <= nf) next
    
    vars <- apply(complete[[i]], 2, var);
    idx <- sort(vars, index.return=TRUE, decreasing = TRUE)$ix;
    
    complete[[i]] <- complete[[i]][, idx[1:nf]];
    
}
```

Standardize features with z-score
```{r}
zscore <- function(data){
    
    zscore_vec <- function(x) { return ((x - mean(x)) / sd(x))}
    data <- apply(data, 2, zscore_vec)
    
    
    return(data)
}

complete <- lapply(complete, zscore);
```
Clean barcodes retaining only "Project-TSS-Participant" (first 12 characters)
```{r}
for(v in 1:length(complete)){
    rownames(complete[[v]]) <- substr(rownames(complete[[v]]), 1, 12);
}
```
# Download disease subtypes
```{r}
subtypes <- as.data.frame(TCGAbiolinks::PanCancerAtlas_subtypes())

subtypes <- subtypes[subtypes$cancer.type == "PRAD", ];
# Retain only primary solid tumors and select samples in common with omics data
# (in the same order):
subtypes <- subtypes[TCGAutils::TCGAsampleSelect(subtypes$pan.samplesID, "01"), ];
sub_select <- substr(subtypes$pan.samplesID,1,12) %in% rownames(complete[[1]]);
subtypes <- subtypes[sub_select, ];
```
We consider only samples in the omics data in ```complete``` for which also subtype data is present in ```subtypes```
```{r}
rownames(subtypes) <- substr(subtypes$pan.samplesID, 1, 12);
for (i in 1:length(complete))
  complete[[i]] <- complete[[i]][rownames(subtypes),]
# Print number of samples for each subtype:
table(subtypes$Subtype_Integrative);
```
## Check subtype & omics data order
```{r}
count <- 0
for(i in 1:length(rownames(subtypes))){
  if(rownames(subtypes)[i] != rownames(complete[[1]])[i])
     count <- count + 1
}
count
```
All patients/samples are in the same order in both the datasets.
# Multi-omics data integration
Similarity matrices for each data source using exponential euclidian distance (affinity matrix).
```{r}
W_list <- list();
for(i in 1:length(complete)){
    Dist <- (dist2(as.matrix(complete[[i]]), as.matrix(complete[[i]])))^(1/2)
    W_list[[i]] <- affinityMatrix(Dist)
}
```
## Integration through SNF
Integration of the matrices using Similarity Network Fusion
*_t_ number of iterations
*_K_ number of neighbours to consider to compute local similarity matrix

```{r}
W_int_SNF <- SNF(W_list, K=20, t=20);
```
## Integration through average
Integration through average of matrices
```{r}
W_int_mean <- Reduce('+', W_list)/length(W_list)
```
```{r}
for (i in 1:length(W_list))
    plot(W_list[[i]],xlim=c(0,0.01),ylim=c(0,0.01))
```

```{r}
plot(W_int_mean,xlim=c(0,0.01),ylim=c(0,0.01))
```
```{r}
plot(W_int_SNF,xlim=c(0,0.01),ylim=c(0,0.01))
```
# Disease subtype discovery with PAM
Number of clusters.
```{r}
k <- length(unique(subtypes$Subtype_Integrative));
```
## Each data source
Convert similarity matrix into a distance matrix. The similarities are normalized in the range &\left[0, 1\right]$ using min-max normalization before conversion into distances.
```{r}
dist <- list()
D <- list()
for (i in 1:length((W_list))){
  dist[[i]] <- 1 - NetPreProc::Max.Min.norm(W_list[[i]])
  D[[i]] <- as.dist(dist[[i]])
}
for (i in 1:length((W_list)))
  pam.res <- pam(D[[i]], k=k)
```
## Integrated data through mean
```{r}
dist_mean <- 1 - NetPreProc::Max.Min.norm(W_int_mean);
D_mean <- as.dist(dist_mean); 

pam.res <- pam(D_mean, k=k);
```
## Integrated data through SNF
```{r}
dist_SNF <- 1 - NetPreProc::Max.Min.norm(W_int_SNF);
D_SNF <- as.dist(dist_SNF); 

# Apply clustering algorithms on integrated matrix:
pam.res <- pam(D_SNF, k=k);
```
#Disease subtype discovery with spectral clustering
(Consider distance matrix calculated above)
## Integrated data through SNF
```{r}
k <- length(unique(subtypes$Subtype_Integrative));
sc.res <- SNFtool::spectralClustering(W_int_SNF, K=k)
#to have uniform type
pam.sc.res <- pam(sc.res,k=k)
```


```{r}
sessionInfo()
```

